library(pacman)
p_load(arules, arulesViz, ggplot2, tidyrules, dplyr, C50, pander)
# Load the transactions; items within a line are separated by a single space.
dataset <- read.transactions("./AssociationRules.csv", sep = " ")

# Absolute occurrence count per item, reshaped into a two-column data frame
# (item label, count) with default row names.
freq_tab <- itemFrequency(dataset, type = "absolute")
freq_tab <- data.frame(item = names(freq_tab), freq = unname(freq_tab))

# Row(s) belonging to the most frequent item.
subset(freq_tab, freq == max(freq))

# Bar chart of the ten most frequent items, followed by a textual overview
# of the transaction set.
itemFrequencyPlot(dataset, topN = 10, type = "absolute")
summary(dataset)
transactions as itemMatrix in sparse format with
10000 rows (elements/itemsets/transactions) and
98 columns (items) and a density of 0.1000643
most frequent items:
item13 item5 item30 item10 item58 (Other)
4948 3699 3308 3035 2831 80242
element (itemset/transaction) length distribution:
sizes
1 2 3 4 5 6 7 8 9 10 11 12
17 88 176 319 490 660 858 1045 1132 1120 1079 859
13 14 15 16 17 18 19 20 21 22 23 24
675 520 398 249 133 97 41 22 7 9 2 1
25
3
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 7.000 10.000 9.806 12.000 25.000
includes extended item information - examples:
# Mine association rules with at least two items and a minimum support of 1%.
# confidence = 0 disables confidence filtering at this stage; rules are
# filtered by confidence later on.
rules <- apriori(
  dataset,
  parameter = list(support = 0.01, confidence = 0, minlen = 2, target = "rules")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 100
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[98 item(s), 10000 transaction(s)] done [0.01s].
sorting and recoding items ... [89 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 done [0.02s].
writing ... [11435 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
# Summarise the mined rule set: rule length distribution and quality measures.
summary(rules)
set of 11435 rules
rule length distribution (lhs + rhs):sizes
2 3 4 5
2952 7206 1272 5
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000 2.000 3.000 2.854 3.000 5.000
summary of quality measures:
support confidence coverage lift count
Min. :0.0100 Min. :0.02183 Min. :0.01000 Min. : 0.6717 Min. : 100
1st Qu.:0.0114 1st Qu.:0.16958 1st Qu.:0.03935 1st Qu.: 1.0103 1st Qu.: 114
Median :0.0140 Median :0.25000 Median :0.06320 Median : 1.1261 Median : 140
Mean :0.0182 Mean :0.28012 Mean :0.08636 Mean : 1.2302 Mean : 182
3rd Qu.:0.0197 3rd Qu.:0.36070 3rd Qu.:0.10340 3rd Qu.: 1.2828 3rd Qu.: 197
Max. :0.1877 Max. :1.00000 Max. :0.49480 Max. :19.4205 Max. :1877
mining info:
# Total number of mined rules.
length(rules)
[1] 11435
# Print every mined rule together with its quality measures.
inspect(rules)

# Keep only the rules whose confidence is at least 0.5, then count them.
rules.hconf <- rules[quality(rules)$confidence >= 0.5]
length(rules.hconf)
[1] 1165
# Ten rules drawn at random (no seed is set in this part of the script).
rules.top <- sample(rules, 10)

# Every rule, ordered from highest to lowest confidence.
rules.conf <- sort(rules, by = "confidence", decreasing = TRUE)

# Every rule ordered by lift: first descending, then ascending.
rules.hlift <- sort(rules, by = "lift", decreasing = TRUE)
rules.llift <- sort(rules, by = "lift", decreasing = FALSE)

# The ten leading rules of the confidence and lift orderings.
rules.top_conf <- head(rules.conf, n = 10)
rules.top_lift <- head(rules.hlift, n = 10)
# Support vs. confidence, shaded by lift: all rules, then the rules with
# confidence >= 0.5.
plot(rules, method = "scatterplot", shading = "lift",
     measure = c("support", "confidence"), jitter = 0.2)
plot(rules.hconf, method = "scatterplot", shading = "lift",
     measure = c("support", "confidence"), jitter = 0.2)

# Support vs. lift, shaded by confidence: same two rule sets.
plot(rules, method = "scatterplot", shading = "confidence",
     measure = c("support", "lift"), jitter = 0.2)
plot(rules.hconf, method = "scatterplot", shading = "confidence",
     measure = c("support", "lift"), jitter = 0.2)
# Preview the first rows of the quality measures of the high-confidence rules.
head(quality(rules.hconf))
# Rules that occur in at least 10% of the transactions.
rules.hsup <- rules[quality(rules)$support >= 0.1]

# Interactive support/confidence scatterplot of those rules, shaded by lift.
plot(rules.hsup, method = "scatterplot", engine = "htmlwidget",
     measure = c("support", "confidence"), shading = "lift")
# Show the three high-support rules with the highest confidence.
# The ordering argument is spelled `decreasing`; the previous spelling
# (`decreasong`) did not name any documented argument.
inspect(head(rules.hsup, n = 3, by = "confidence", decreasing = TRUE))
# Show the ten rules with the highest lift.
inspect(head(rules.hlift, 10))

# Interactive matrix view of those ten rules, coloured by lift.
# The control parameter is named `reorder`; a misspelled name (`recorder`)
# is reported as an unknown control parameter and the default ("measure")
# is used instead. "none" keeps the rules in their existing order.
plot(head(rules.hlift, 10),
     method = "matrix",
     measure = "lift",
     control = list(reorder = "none"),
     engine = "htmlwidget")
Unknown control parameters: recorder
Available control parameters (with default values):
interactive = TRUE
engine = htmlwidget
max = 1000
colors = c("#EE0000FF", "#EEEEEEFF")
reorder = measure
precision = 3
verbose = FALSE
# Show the ten rules with the lowest lift.
inspect(head(rules.llift, 10))

# Interactive matrix view of those ten rules, coloured by lift.
# The control parameter is named `reorder`; a misspelled name (`recorder`)
# is reported as an unknown control parameter and the default ("measure")
# is used instead. "none" keeps the rules in their existing order.
plot(head(rules.llift, 10),
     method = "matrix",
     measure = "lift",
     control = list(reorder = "none"),
     engine = "htmlwidget")
Unknown control parameters: recorder
Available control parameters (with default values):
interactive = TRUE
engine = htmlwidget
max = 1000
colors = c("#EE0000FF", "#EEEEEEFF")
reorder = measure
precision = 3
verbose = FALSE
# Interactive support/confidence scatterplot of the full rule set, shaded by
# lift.
plot(rules, method = "scatterplot", engine = "htmlwidget",
     measure = c("support", "confidence"), shading = "lift")
plot: Too many rules supplied. Only plotting the best 1000 rules using measure lift (change parameter max if needed)To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Interactive support/lift scatterplot of the full rule set, shaded by
# confidence.
plot(rules, method = "scatterplot", engine = "htmlwidget",
     measure = c("support", "lift"), shading = "confidence")
plot: Too many rules supplied. Only plotting the best 1000 rules using measure confidence (change parameter max if needed)To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Parallel-coordinates view of the top-10 rules by confidence and by lift.
plot(rules.top_conf, method = "paracoord")
plot(rules.top_lift, method = "paracoord")

# Grouped view of the top-10 lift rules.
plot(rules.top_lift, method = "grouped")

# Graph views: interactive for the top-10 confidence rules, static for the
# three rules with the highest lift.
plot(rules.top_conf, engine = "htmlwidget", method = "graph")
top_3_lift <- head(rules.top_lift, n = 3)
plot(top_3_lift, method = "graph")
# Rules with a confidence of at least 0.8.
rules.hight_trustly <- rules[quality(rules)$confidence >= 0.8]

# The 40 highest-lift rules among them.
# NOTE(review): the name says 38 but 40 rules are requested, and the object
# is not used in the visible part of the script -- confirm which is intended.
first_38_conf <- head(rules.hight_trustly, n = 40, by = "lift",
                      decreasing = TRUE)

# Matrix plot of all high-confidence rules using both lift and confidence,
# without reordering rows/columns.
plot(rules.hight_trustly,
     method = "matrix",
     measure = c("lift", "confidence"),
     shading = c("lift", "confidence"),
     control = list(reorder = FALSE))
Itemsets in Antecedent (LHS)
[1] "{item55}" "{item83}" "{item23}" "{item10,item44}"
[5] "{item20,item23}" "{item23,item5}" "{item49,item56}" "{item15,item49}"
[9] "{item82,item99}" "{item15,item49,item56}" "{item30,item49,item56}" "{item15,item30,item49}"
[13] "{item49,item56,item84}" "{item30,item49,item84}" "{item15,item49,item84}" "{item49,item77,item84}"
[17] "{item5,item82,item99}" "{item13,item82,item99}" "{item15,item56,item77}" "{item30,item56,item77}"
[21] "{item15,item56,item84}" "{item15,item30,item56}" "{item22,item3,item41}" "{item10,item22,item41}"
[25] "{item25,item34,item77}" "{item16,item34,item77}" "{item20,item25,item41}" "{item16,item25,item77}"
[29] "{item16,item61,item77}" "{item30,item95,item96}" "{item3,item84,item95}"
Itemsets in Consequent (RHS)
[1] "{item34}" "{item13}" "{item15}" "{item56}" "{item84}" "{item30}" "{item5}" "{item77}" "{item10}"
[10] "{item3}" "{item92}"
# Positional split: the first 8000 transactions are used for training, the
# transactions 8001 through 10000 are held out.
dataset.train <- dataset[seq_len(8000), ]
dataset.test <- dataset[seq(8001, 10000), ]

# Mine rules on the training part (support >= 1%, confidence >= 10%,
# at least two items per rule).
model <- apriori(
  dataset.train,
  parameter = list(support = 0.01, confidence = 0.1, minlen = 2,
                   target = "rules")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 80
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[98 item(s), 8000 transaction(s)] done [0.01s].
sorting and recoding items ... [89 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 done [0.02s].
writing ... [10747 rule(s)] done [0.00s].
creating S4 object ... done [0.01s].
# Mine rules on the held-out part with the same thresholds as the training
# run. The rules are mined independently here, not evaluated from `model`.
validate_model <- apriori(
  dataset.test,
  parameter = list(support = 0.01, confidence = 0.1, minlen = 2,
                   target = "rules")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 20
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[98 item(s), 2000 transaction(s)] done [0.00s].
sorting and recoding items ... [89 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 done [0.01s].
writing ... [12238 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
# Convert both rule sets to data frames so their quality columns can be
# averaged.
train_df <- DATAFRAME(model)
validate_df <- DATAFRAME(validate_model)

# Mean support of the rules mined on the training part.
cat("Support train:\t", mean(train_df[["support"]]), "\n")
Support train: 0.01837056
# Mean confidence of the rules mined on the training part.
cat("Confidence train:\t", mean(train_df[["confidence"]]), "\n")
Confidence train: 0.2964712
# Mean support of the rules mined on the held-out part.
cat("Support validate:\t", mean(validate_df[["support"]]), "\n")
Support validate: 0.01776226
# Mean confidence of the rules mined on the held-out part.
cat("Confidence validate:\t", mean(validate_df[["confidence"]]), "\n")
Confidence validate: 0.3081568